# Load the raw bookings data, standardize names/types, and keep only
# non-canceled bookings for the rest of the analysis.
a = read_csv('hotel_bookings.csv') %>%
  clean_names() %>%                               # snake_case column names
  mutate(across(where(is.character), factor)) %>% # strings -> factors
  select(sort(tidyselect::peek_vars())) %>%       # alphabetical column order
  select(                                         # then grouped: dates, factors, numerics
    where(is.Date),
    where(is.factor),
    where(is.numeric)
  ) %>%
  filter(is_canceled == 0) # filter to non-canceled bookings
# is_canceled is now constant (all 0), so drop it
a$is_canceled = NULL
# reservation_status_date semantics are unclear; an 'arrival.date' field is
# built later from the year/month/day components instead, so drop it.
# (The original removed this column twice -- the duplicate line is gone.)
a$reservation_status_date = NULL
skimr::skim(a)| Name | a |
| Number of rows | 75166 |
| Number of columns | 30 |
| _______________________ | |
| Column type frequency: | |
| factor | 13 |
| numeric | 17 |
| ________________________ | |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| agent | 0 | 1 | FALSE | 315 | 9: 18697, NUL: 12310, 240: 8438, 7: 3065 |
| arrival_date_month | 0 | 1 | FALSE | 12 | Aug: 8638, Jul: 7919, May: 7114, Oct: 6914 |
| assigned_room_type | 0 | 1 | FALSE | 10 | A: 41105, D: 18960, E: 5838, F: 2824 |
| company | 0 | 1 | FALSE | 332 | NUL: 69560, 40: 850, 223: 665, 45: 222 |
| country | 0 | 1 | FALSE | 166 | PRT: 21071, GBR: 9676, FRA: 8481, ESP: 6391 |
| customer_type | 0 | 1 | FALSE | 4 | Tra: 53099, Tra: 18735, Con: 2814, Gro: 518 |
| deposit_type | 0 | 1 | FALSE | 3 | No : 74947, Ref: 126, Non: 93 |
| distribution_channel | 0 | 1 | FALSE | 5 | TA/: 57718, Dir: 12088, Cor: 5203, GDS: 156 |
| hotel | 0 | 1 | FALSE | 2 | Cit: 46228, Res: 28938 |
| market_segment | 0 | 1 | FALSE | 7 | Onl: 35738, Off: 15908, Dir: 10672, Gro: 7714 |
| meal | 0 | 1 | FALSE | 5 | BB: 57800, HB: 9479, SC: 6684, Und: 883 |
| reservation_status | 0 | 1 | FALSE | 1 | Che: 75166, Can: 0, No-: 0 |
| reserved_room_type | 0 | 1 | FALSE | 9 | A: 52364, D: 13099, E: 4621, F: 2017 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| adr | 0 | 1 | 99.99 | 49.21 | -6.38 | 67.5 | 92.5 | 125 | 510 | <U+2587><U+2586><U+2581><U+2581><U+2581> |
| adults | 0 | 1 | 1.83 | 0.51 | 0.00 | 2.0 | 2.0 | 2 | 4 | <U+2581><U+2582><U+2587><U+2581><U+2581> |
| arrival_date_day_of_month | 0 | 1 | 15.84 | 8.78 | 1.00 | 8.0 | 16.0 | 23 | 31 | <U+2587><U+2587><U+2587><U+2587><U+2586> |
| arrival_date_week_number | 0 | 1 | 27.08 | 13.90 | 1.00 | 16.0 | 28.0 | 38 | 53 | <U+2586><U+2587><U+2587><U+2587><U+2586> |
| arrival_date_year | 0 | 1 | 2016.15 | 0.70 | 2015.00 | 2016.0 | 2016.0 | 2017 | 2017 | <U+2583><U+2581><U+2587><U+2581><U+2586> |
| babies | 0 | 1 | 0.01 | 0.11 | 0.00 | 0.0 | 0.0 | 0 | 10 | <U+2587><U+2581><U+2581><U+2581><U+2581> |
| booking_changes | 0 | 1 | 0.29 | 0.74 | 0.00 | 0.0 | 0.0 | 0 | 21 | <U+2587><U+2581><U+2581><U+2581><U+2581> |
| children | 0 | 1 | 0.10 | 0.39 | 0.00 | 0.0 | 0.0 | 0 | 3 | <U+2587><U+2581><U+2581><U+2581><U+2581> |
| days_in_waiting_list | 0 | 1 | 1.59 | 14.78 | 0.00 | 0.0 | 0.0 | 0 | 379 | <U+2587><U+2581><U+2581><U+2581><U+2581> |
| is_repeated_guest | 0 | 1 | 0.04 | 0.20 | 0.00 | 0.0 | 0.0 | 0 | 1 | <U+2587><U+2581><U+2581><U+2581><U+2581> |
| lead_time | 0 | 1 | 79.98 | 91.11 | 0.00 | 9.0 | 45.0 | 124 | 737 | <U+2587><U+2582><U+2581><U+2581><U+2581> |
| previous_bookings_not_canceled | 0 | 1 | 0.20 | 1.81 | 0.00 | 0.0 | 0.0 | 0 | 72 | <U+2587><U+2581><U+2581><U+2581><U+2581> |
| previous_cancellations | 0 | 1 | 0.02 | 0.27 | 0.00 | 0.0 | 0.0 | 0 | 13 | <U+2587><U+2581><U+2581><U+2581><U+2581> |
| required_car_parking_spaces | 0 | 1 | 0.10 | 0.30 | 0.00 | 0.0 | 0.0 | 0 | 8 | <U+2587><U+2581><U+2581><U+2581><U+2581> |
| stays_in_week_nights | 0 | 1 | 2.46 | 1.92 | 0.00 | 1.0 | 2.0 | 3 | 50 | <U+2587><U+2581><U+2581><U+2581><U+2581> |
| stays_in_weekend_nights | 0 | 1 | 0.93 | 0.99 | 0.00 | 0.0 | 1.0 | 2 | 19 | <U+2587><U+2581><U+2581><U+2581><U+2581> |
| total_of_special_requests | 0 | 1 | 0.71 | 0.83 | 0.00 | 0.0 | 1.0 | 1 | 5 | <U+2587><U+2581><U+2581><U+2581><U+2581> |
# Build a proper Date column from the year/month/day components and sort by
# it. arrival_date_month holds full English month names, so match() against
# month.name converts them to month numbers.
a = a %>%
  mutate(
    arrival.date = make_date(
      year = arrival_date_year,
      month = match(arrival_date_month, month.name),
      day = arrival_date_day_of_month
    )
  ) %>%
  arrange(arrival.date)
# Snapshot for the anomaly-detection section later on, taken BEFORE the
# factor conversions below.
a.anomaly = a
# These discrete numeric vars are really categorical; convert to factors.
# across() replaces the superseded mutate_at()/vars() idiom.
a = a %>% mutate(
  across(
    c(arrival_date_day_of_month, arrival_date_week_number,
      arrival_date_year, is_repeated_guest),
    factor
  )
)
# Reorder columns again after the factor conversions: alphabetical, then
# grouped as dates / factors / numerics.
a = a %>%
  select(sort(tidyselect::peek_vars())) %>%
  select(
    where(is.Date),
    where(is.factor),
    where(is.numeric)
  )
# Report the covered date range. (The original fused this statement onto the
# closing paren above, computed range() twice, and claimed "3 years" --
# 2015-07-01 to 2017-08-31 is just over 2 years.)
date.range = range(a$arrival.date)
paste(
  'The date range of this dataset is from',
  date.range[1],
  'to',
  date.range[2],
  ', just over 2 years of data.'
)
## [1] "The date range of this dataset is from 2015-07-01 to 2017-08-31 , just over 2 years of data."
# The next four charts all show the same thing -- total individuals
# (adults + children + babies) per arrival date -- split by a different
# grouping factor, so the shared pipeline is factored into one helper.
# `group_col` is the grouping column name as a string.
# NOTE: the original calls passed `alphtrain = 0.7`, a typo for plot_ly's
# `alpha` argument that was silently swallowed by `...`; fixed here.
plot.daily.totals = function(df, group_col, alpha = 0.7) {
  df %>%
    group_by(arrival.date, .data[[group_col]]) %>%
    summarise(total.individuals = sum(adults, children, babies), .groups = 'drop') %>%
    arrange(arrival.date) %>%
    plot_ly(
      x = ~arrival.date,
      y = ~total.individuals,
      color = as.formula(paste0('~', group_col)), # plot_ly expects a formula
      alpha = alpha
    ) %>%
    layout(
      title = paste0('total.individuals by date/', group_col),
      xaxis = list(title = ''),
      yaxis = list(title = '')
    )
}
plot.daily.totals(a, 'hotel')
plot.daily.totals(a, 'customer_type')
plot.daily.totals(a, 'deposit_type')
plot.daily.totals(a, 'distribution_channel')
## Rows: 75,166
## Columns: 17
## $ agent <fct> NULL, NULL, NULL, 304, 240, 240, NULL, 30...
## $ arrival_date_day_of_month <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ arrival_date_month <fct> July, July, July, July, July, July, July,...
## $ arrival_date_week_number <fct> 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 2...
## $ arrival_date_year <fct> 2015, 2015, 2015, 2015, 2015, 2015, 2015,...
## $ assigned_room_type <fct> C, C, C, A, A, A, C, C, D, E, G, E, E, E,...
## $ company <fct> NULL, NULL, NULL, NULL, NULL, NULL, NULL,...
## $ country <fct> PRT, PRT, GBR, GBR, GBR, GBR, PRT, PRT, P...
## $ customer_type <fct> Transient, Transient, Transient, Transien...
## $ deposit_type <fct> No Deposit, No Deposit, No Deposit, No De...
## $ distribution_channel <fct> Direct, Direct, Direct, Corporate, TA/TO,...
## $ hotel <fct> Resort Hotel, Resort Hotel, Resort Hotel,...
## $ is_repeated_guest <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ market_segment <fct> Direct, Direct, Direct, Corporate, Online...
## $ meal <fct> BB, BB, BB, BB, BB, BB, BB, FB, HB, BB, H...
## $ reservation_status <fct> Check-Out, Check-Out, Check-Out, Check-Ou...
## $ reserved_room_type <fct> C, C, A, A, A, A, C, C, D, D, G, E, D, E,...
# Qualitative palette, interpolated from Dark2 to cover larger groupings.
jpal = colorRampPalette(brewer.pal(8, 'Dark2'))(15)
# Bar chart of the number of distinct levels per factor column.
# as_tibble() replaces the deprecated as.tibble().
a %>% select(where(is.factor)) %>%
  map(n_unique) %>%
  as_tibble() %>%
  pivot_longer(everything()) %>%
  plot_ly(y = ~name, x = ~value, color = ~name, colors = jpal) %>%
  add_bars() %>%
  hide_legend() %>%
  layout(
    title = 'distribution of level counts per factor',
    xaxis = list(title = ''),
    yaxis = list(title = '')
  )
## $agent
## [1] NULL 304 240 303 241 8 250 115 5 253 6 156 175 134 243
## [16] 242 69 1 3 105 40 11 147 142 196 306 184 96 2 127
## [31] 128 95 143 13 7 146 27 177 26 244 149 22 15 261 17
## [46] 167 300 305 67 19 152 171 104 36 28 9 71 110 14 42
## [61] 258 20 181 66 45 114 34 29 37 57 61 16 88 10 39
## [76] 21 24 251 275 301 50 30 68 4 70 72 75 245 54 81
## [91] 248 89 208 183 74 52 86 201 12 44 92 256 31 314 83
## [106] 94 273 82 281 126 193 63 32 90 79 99 85 117 112 87
## [121] 185 47 106 98 111 330 324 334 328 326 182 321 332 313 38
## [136] 339 119 336 78 135 148 151 155 348 350 138 56 91 103 352
## [151] 121 158 168 335 308 118 159 139 195 180 355 315 363 144 153
## [166] 211 210 360 187 375 129 213 174 327 220 387 270 173 367 216
## [181] 154 298 384 232 64 35 23 58 385 393 406 205 157 249 405
## [196] 133 150 214 192 132 191 163 267 414 333 215 252 227 247 427
## [211] 278 219 307 430 431 429 420 280 426 285 433 289 269 438 436
## [226] 418 265 441 295 294 223 432 282 288 446 290 122 325 450 341
## [241] 434 310 454 234 455 344 77 59 368 451 346 468 464 359 283
## [256] 364 370 33 358 371 411 25 179 53 481 141 378 391 254 397
## [271] 331 469 165 416 404 299 73 467 197 510 354 440 444 296 337
## [286] 476 526 493 394 425 461 408 390 527 388 502 453 479 410 262
## [301] 508 459 229 474 475 423 480 484 497 535 302 531 495 509 449
## 334 Levels: 1 10 103 104 105 106 107 11 110 111 112 114 115 117 118 119 ... NULL
##
## $arrival_date_day_of_month
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
## [26] 26 27 28 29 30 31
## 31 Levels: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 ... 31
##
## $arrival_date_month
## [1] July August September October November December January
## [8] February March April May June
## 12 Levels: April August December February January July June March ... September
##
## $arrival_date_week_number
## [1] 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51
## [26] 52 53 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
## [51] 24 25 26
## 53 Levels: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 ... 53
##
## $arrival_date_year
## [1] 2015 2016 2017
## Levels: 2015 2016 2017
##
## $assigned_room_type
## [1] C A D E G F I B H K
## Levels: A B C D E F G H I K L P
##
## $company
## [1] NULL 110 204 270 154 135 113 42 240 20 83 9 40 45 46
## [16] 103 38 144 108 47 49 307 268 51 62 67 82 200 195 68
## [31] 59 76 72 86 246 84 312 65 91 318 81 28 96 287 93
## [46] 174 319 274 100 31 223 286 159 292 281 92 116 115 101 105
## [61] 323 317 329 94 118 325 43 331 88 11 324 53 278 12 297
## [76] 137 139 80 16 337 142 338 127 130 342 120 148 343 346 347
## [91] 107 140 349 143 149 351 289 61 291 160 353 355 290 163 8
## [106] 54 85 99 250 269 358 362 192 361 14 366 372 277 365 109
## [121] 207 178 377 179 378 379 183 380 22 364 360 371 232 197 384
## [136] 203 167 185 219 217 388 212 209 390 180 391 215 376 193 392
## [151] 221 165 153 213 224 186 396 302 398 225 237 6 216 222 263
## [166] 370 399 230 169 367 234 397 35 400 369 401 402 403 227 251
## [181] 282 405 158 146 168 245 218 104 382 255 78 408 258 254 409
## [196] 259 260 413 238 407 34 10 272 257 106 271 18 419 275 415
## [211] 273 421 424 210 425 423 428 422 71 284 395 439 435 442 301
## [226] 233 305 445 293 264 311 288 304 308 313 320 448 150 443 314
## [241] 454 334 332 444 394 333 456 52 459 341 458 39 465 460 447
## [256] 352 470 356 243 466 73 368 457 482 350 484 184 485 330 32
## [271] 383 487 491 490 494 112 496 393 499 132 29 501 220 412 411
## [286] 420 504 410 511 426 507 417 514 506 498 242 515 516 512 126
## [301] 64 429 477 433 518 446 450 523 521 418 528 530 437 280 357
## [316] 479 478 483 534 436 489 229 486 481 539 525 497 541 520 455
## [331] 451 492
## 353 Levels: 10 100 101 102 103 104 105 106 107 108 109 11 110 112 113 ... NULL
##
## $country
## [1] PRT GBR USA ESP IRL FRA NULL ROU NOR OMN ARG DEU POL BEL BRA
## [16] ITA CHE CN GRC NLD DNK RUS SWE AUS EST CZE FIN AUT HUN ISR
## [31] MOZ BWA NZL LUX IDN SVN ALB MAR CHN HRV AGO BGR IND DZA MEX
## [46] TUN COL KAZ LVA STP UKR VEN TWN SMR KOR TUR BLR JPN PRI SRB
## [61] LTU CPV AZE LBN CRI CHL THA IRN SVK EGY CMR LIE MYS SAU ZAF
## [76] MKD MMR DOM IRQ SGP CYM ZMB PAN ZWE SEN NGA GIB ARM PER KNA
## [91] JOR KWT LKA GEO TMP ETH MUS ECU PHL CUB ARE BFA CYP KEN BIH
## [106] COM SUR JAM MCO GNB RWA LBY UGA TZA CIV SYR MLI BGD ISL BHR
## [121] NAM BOL BDI MLT URY PAK SYC QAT PRY BRB ABW AND VNM AIA SLV
## [136] PLW DMA GAB CAF PYF LCA GUY ATA MDV MWI MNE GTM MDG GHA ASM
## [151] TGO UZB NPL MRT BHS HKG NCL MAC KIR SDN ATF TJK DJI SLE LAO
## [166] FRO
## 178 Levels: ABW AGO AIA ALB AND ARE ARG ARM ASM ATA ATF AUS AUT AZE BDI ... ZWE
##
## $customer_type
## [1] Transient Contract Transient-Party Group
## Levels: Contract Group Transient Transient-Party
##
## $deposit_type
## [1] No Deposit Refundable Non Refund
## Levels: No Deposit Non Refund Refundable
##
## $distribution_channel
## [1] Direct Corporate TA/TO Undefined GDS
## Levels: Corporate Direct GDS TA/TO Undefined
##
## $hotel
## [1] Resort Hotel City Hotel
## Levels: City Hotel Resort Hotel
##
## $is_repeated_guest
## [1] 0 1
## Levels: 0 1
##
## $market_segment
## [1] Direct Corporate Online TA Offline TA/TO Groups
## [6] Complementary Aviation
## 8 Levels: Aviation Complementary Corporate Direct Groups ... Undefined
##
## $meal
## [1] BB FB HB SC Undefined
## Levels: BB FB HB SC Undefined
##
## $reservation_status
## [1] Check-Out
## Levels: Canceled Check-Out No-Show
##
## $reserved_room_type
## [1] C A D G E F H L B
## Levels: A B C D E F G H L P
## Rows: 75,166
## Columns: 13
## $ adr <dbl> 0.00, 0.00, 75.00, 75.00, 98.00, 98....
## $ adults <dbl> 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, ...
## $ babies <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ booking_changes <dbl> 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...
## $ children <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...
## $ days_in_waiting_list <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ lead_time <dbl> 342, 737, 7, 13, 14, 14, 0, 9, 35, 6...
## $ previous_bookings_not_canceled <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ previous_cancellations <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ required_car_parking_spaces <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ stays_in_week_nights <dbl> 0, 0, 1, 1, 2, 2, 2, 2, 4, 4, 4, 4, ...
## $ stays_in_weekend_nights <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ total_of_special_requests <dbl> 0, 0, 0, 0, 1, 1, 0, 1, 0, 3, 1, 0, ...
# Upper-tail quantiles of a numeric vector, used below to eyeball
# right-tailed outliers.
# `probs` and `na.rm` are exposed as arguments; the defaults reproduce the
# original behavior (90/95/99/100th percentiles, NAs not removed).
jquantiles = function(col, probs = c(0.90, 0.95, 0.99, 1), na.rm = FALSE) {
  quantile(col, probs = probs, na.rm = na.rm)
}
# Upper-tail quantile table for every numeric column.
# as_tibble() replaces the deprecated as.tibble(); `map(.x = ., jquantiles)`
# simplified to the equivalent `map(jquantiles)`.
a %>% na.omit %>% select(where(is.numeric)) %>%
  map(jquantiles) %>%
  as.data.frame.list() %>%
  rownames_to_column() %>%
  as_tibble()
## Warning in log(x): NaNs produced
## Warning in sqrt(x): NaNs produced
# no extreme right tailed outliers
# Histograms of the numeric vars with single extreme values excluded.
# NOTE(review): each `!=` drops only rows exactly equal to that one value,
# and several thresholds (adults 55, children 10, days_in_waiting_list 391,
# lead_time 709, previous_cancellations 26) do not match the maxima in the
# skim output above -- they look like maxima from the unfiltered dataset,
# making those conditions no-ops here. Confirm against the raw data.
a %>% select(where(is.numeric)) %>% filter(
adr != 510,
adults != 55,
babies != 10,
booking_changes != 21,
children != 10,
days_in_waiting_list != 391,
lead_time != 709,
previous_bookings_not_canceled != 72,
previous_cancellations != 26,
required_car_parking_spaces != 8,
stays_in_week_nights != 50,
stays_in_weekend_nights != 19
) %>% DataExplorer::plot_histogram(nrow = 2, ncol = 1)#no outliers
# Boxplots of the numeric vars by hotel type, with the same single extreme
# values excluded as in the histogram chunk above.
# NOTE(review): same caveat as above -- several of these `!=` thresholds do
# not correspond to maxima visible in this (cancel-filtered) dataset and
# therefore drop nothing; verify against the unfiltered data.
a %>% select(hotel, where(is.numeric)) %>% filter(
adr != 510,
adults != 55,
babies != 10,
booking_changes != 21,
children != 10,
days_in_waiting_list != 391,
lead_time != 709,
previous_bookings_not_canceled != 72,
previous_cancellations != 26,
required_car_parking_spaces != 8,
stays_in_week_nights != 50,
stays_in_weekend_nights != 19
) %>% DataExplorer::plot_boxplot(by = 'hotel', nrow = 3, ncol = 1)#check: check number of bookings by combination of adult and children counts
# Booking counts by (adults, children) combination, then targeted roll-ups.
# (The export had fused these four statements and the paste() below onto a
# single line, which is a syntax error in R.)
a %>% count(adults, children)
a %>% count(adults, children) %>% filter(adults == 0) %>%
  summarise(total.bookings.without.adults = sum(n))
a %>% count(adults, children) %>% filter(children == 0) %>%
  summarise(total.bookings.without.children = sum(n))
a %>% count(adults, children) %>% filter(children == 0, adults == 0) %>%
  summarise(total.bookings.without.adults.and.children = sum(n))
# Share of bookings with at least one adult and no children.
paste(
  scales::percent(
    nrow(a %>% filter(adults > 0, children == 0)) / nrow(a)
  ),
  'of all observations have 1 or more adults and zero children.'
)
## [1] "93% of all observations have 1 or more adults and zero children."
# ADR per child for child-only bookings (no adults).
# Fix: the title must be passed as `title =`; an unnamed string is swallowed
# by layout()'s `...` and never displayed.
a %>% filter(adults == 0, children > 0) %>%
  mutate(adr.per.child = adr / children) %>%
  plot_ly(x = ~adr.per.child) %>%
  add_boxplot() %>%
  layout(title = 'ADR distribution per child')
# creating adr per adult var
# NOTE(review): a.adult.adrs is referenced here before any visible
# definition -- presumably created in a chunk not shown in this export
# (likely `a` filtered to adults > 0); confirm upstream before relying on it.
a.adult.adrs = a.adult.adrs %>% mutate(adr.per.adult = adr/adults)
# ADR per adult, split by hotel type.
a.adult.adrs %>%
plot_ly(y = ~hotel, x = ~adr.per.adult, color = ~hotel, colors = jpal[1:2]) %>%
add_boxplot() %>%
layout(
title = 'ADR per Adult by Hotel Type',
xaxis = list(title = ''),
yaxis = list(title = '')
)#https://stackoverflow.com/questions/57300053/split-a-plotly-boxplot-x-axis-by-group
# Grouped boxplots: ADR per adult by hotel, split by customer_type.
# NOTE(review): plot_ly's `group =` argument is deprecated and likely
# ignored with a warning; the visible grouping comes from `color` together
# with boxmode = 'group' in layout(). Verify before removing it.
a.adult.adrs %>%
plot_ly(y = ~hotel, x = ~adr.per.adult, color = ~customer_type, colors = jpal, group = ~customer_type) %>%
add_boxplot() %>%
layout(
boxmode = 'group', # required for side-by-side boxes per group
title = 'ADR per Adult by Hotel/customer_type'
) #https://stackoverflow.com/questions/57300053/split-a-plotly-boxplot-x-axis-by-group
# Grouped boxplots: ADR by hotel, split by market_segment.
# Title fixed: this chart plots raw `adr`, not `adr.per.adult` as the
# original title claimed.
a.adult.adrs %>%
  plot_ly(y = ~hotel, x = ~adr, color = ~market_segment, colors = jpal, group = ~market_segment) %>%
  add_boxplot() %>%
  layout(
    boxmode = 'group', # required for side-by-side boxes per group
    title = 'ADR by Hotel/market_segment'
  ) #https://stackoverflow.com/questions/57300053/split-a-plotly-boxplot-x-axis-by-group
# Grouped boxplots: ADR by hotel, split by arrival month.
# Title fixed: this chart plots raw `adr`, not `adr.per.adult` as the
# original title claimed.
a.adult.adrs %>%
  plot_ly(x = ~hotel, y = ~adr, color = ~arrival_date_month, colors = jpal, group = ~arrival_date_month) %>%
  add_boxplot() %>%
  layout(
    boxmode = 'group', # required for side-by-side boxes per group
    title = 'ADR by Hotel/arrival_date_month',
    hoverformat = '.0f' # round hover values to whole numbers
  ) ##
## Attaching package: 'DescTools'
## The following object is masked from 'package:data.table':
##
## %like%
Observations
# Pairwise summary (GGally::ggpairs) of hotel vs customer_type, colored by
# hotel, with manual fill colors matching the two hotel types.
a %>% select(
hotel,
customer_type
) %>% GGally::ggpairs(
mapping = aes(color = hotel)
) + scale_fill_manual(values = c('darkblue','darkorange'))# time series dataset
# Weekly time series of adult bookings (adult-only stays), one row per
# (week, hotel). NOTE(review): `ts` shadows stats::ts(); consider renaming
# if base ts() is needed later. arrival.date is rebuilt here even though `a`
# already carries it -- kept to preserve the original behavior.
ts = a %>% na.omit %>% filter(
    adults > 0,
    children == 0
  ) %>% mutate(
    arrival.date = lubridate::make_date(
      year = arrival_date_year,
      month = match(arrival_date_month, month.name),
      day = arrival_date_day_of_month
    )
  ) %>% mutate(
    arrival.week = floor_date(arrival.date, 'week', week_start = 7) # weeks start Sunday
  ) %>% group_by(arrival.week, hotel) %>%
  summarise(total.adult.bookings = sum(adults), .groups = 'drop') %>%
  arrange(arrival.week)
#---------------------------- city
ts.city = ts %>% filter(hotel == 'City Hotel')
ts.city$hotel = NULL
ts.city.split = initial_time_split(ts.city) # time-ordered split (default 3/4 train)
ts.city.a = training(ts.city.split)
ts.city.test = testing(ts.city.split)
#---------------------------- resort
ts.resort = ts %>% filter(hotel == 'Resort Hotel')
ts.resort$hotel = NULL
ts.resort.split = initial_time_split(ts.resort)
ts.resort.a = training(ts.resort.split)
ts.resort.test = testing(ts.resort.split)
# (the export had fused this library() call onto the line above)
library(anomalize)
# time_decompose(data, target, method = c("stl", "twitter"), frequency = "auto", trend = "auto", ..., merge = FALSE, message = TRUE)
# anomalize(data, target, method = c("iqr", "gesd"), alpha = 0.05, max_anoms = 0.2, verbose = FALSE)
# The alpha parameter adjusts the width of the critical values. By default, alpha = 0.05.
# Lower values are more conservative while higher values are less prone to incorrectly classifying "normal" observations.
# max_anoms: The maximum percent of anomalies permitted to be identified.
# The STL method uses the stl() function from the stats package. STL works very well in circumstances where a long term trend is present (which applies in this case; see trend component in the prophet graphs below).
# STL decomposition + GESD anomaly detection on the city-hotel training
# series. alpha = 0.15 increases sensitivity to outliers relative to the
# 0.05 default. as_tibble() replaces the deprecated as.tibble().
(ts.city.a %>% as_tibble() %>%
  time_decompose(total.adult.bookings, method = 'stl', merge = TRUE) %>%
  anomalize(remainder, alpha = 0.15, method = 'gesd') %>%
  time_recompose())
# Interactive version of the same decomposition with anomalies circled.
ggplotly(
  ts.city.a %>% as_tibble() %>%
    time_decompose(total.adult.bookings, method = 'stl', merge = TRUE) %>%
    anomalize(remainder, alpha = 0.15, method = 'gesd') %>%
    time_recompose() %>%
    plot_anomalies(
      ncol = 2,
      alpha_dots = 0.5,
      alpha_circles = 0.5,
      size_circles = 2,
      time_recomposed = TRUE,
      alpha_ribbon = 0.05
    ) + scale_y_continuous(labels = comma) +
    labs(x = '', y = 'total.bookings', title = 'city hotel total.adult.bookings')
)
#----------------------------
# Same STL + GESD anomaly pass for the resort-hotel training series.
# as_tibble() replaces the deprecated as.tibble().
(ts.resort.a %>% as_tibble() %>%
  time_decompose(total.adult.bookings, method = 'stl', merge = TRUE) %>%
  anomalize(remainder, alpha = 0.15, method = 'gesd') %>% # increased sensitivity to outliers
  time_recompose())
ggplotly(
  ts.resort.a %>% as_tibble() %>%
    time_decompose(total.adult.bookings, method = 'stl', merge = TRUE) %>%
    anomalize(remainder, alpha = 0.15, method = 'gesd') %>%
    time_recompose() %>%
    plot_anomalies(
      ncol = 2,
      alpha_dots = 0.5,
      alpha_circles = 0.5,
      size_circles = 2,
      time_recomposed = TRUE,
      alpha_ribbon = 0.05
    ) + scale_y_continuous(labels = comma) +
    labs(x = '', y = 'total.bookings', title = 'resort hotel total adults')
)
## Loading required package: Rcpp
##
## Attaching package: 'Rcpp'
## The following object is masked from 'package:rsample':
##
## populate
## Loading required package: rlang
##
## Attaching package: 'rlang'
## The following objects are masked from 'package:purrr':
##
## %@%, as_function, flatten, flatten_chr, flatten_dbl, flatten_int,
## flatten_lgl, flatten_raw, invoke, list_along, modify, prepend,
## splice
## The following object is masked from 'package:data.table':
##
## :=
# https://dygraphs.com/options.html
# Rename cols to prophet's expected names (ds = date, y = value).
prophet.city.df = ts.city.a %>% select(ds = arrival.week, y = total.adult.bookings)
# Fit the model; weekly/daily seasonality are auto-disabled for
# weekly-aggregated data (see messages below).
prophet.city.mdl = prophet.city.df %>% prophet(yearly.seasonality = TRUE)
## Disabling weekly seasonality. Run prophet with weekly.seasonality=TRUE to override this.
## Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
# Future dataframe: full history plus the next 4 weeks.
prophet.city.future.df = prophet.city.mdl %>% make_future_dataframe(
  periods = 4, # next 4 wks
  freq = 'week',
  include_history = TRUE
)
# Forecast over history + future.
prophet.city.forecast.df = prophet.city.mdl %>% predict(prophet.city.future.df)
prophet.city.forecast.df %>% head %>% DT::datatable()
# Plot the forecast. The trailing comma after pointSize is removed -- it
# silently passed an empty argument into dyOptions().
prophet.city.mdl %>% dyplot.prophet(
  prophet.city.forecast.df,
  main = '<h style="color: black; font-size:18px;">City Hotel: Total Adults 4 Week Prediction</h>'
) %>%
  dygraphs::dyOptions(
    colors = c('darkgreen','blue'),
    pointSize = 2
  )
## Warning: `select_()` is deprecated as of dplyr 0.7.0.
## Please use `select()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
# Compare the first 4 held-out test weeks against the last 4 forecast rows.
# (The export had fused the three statements below onto shared lines.)
(prophet.city.preds.truth = tibble(
  arrival.date = ts.city.test %>% pull(arrival.week) %>% head(4),
  truth = ts.city.test %>% pull(total.adult.bookings) %>% head(4),
  estimate = prophet.city.forecast.df %>% pull(yhat) %>% tail(4)
))
# RMSE and MAE on the 4-week holdout.
bind_rows(
  prophet.city.preds.truth %>% yardstick::rmse(truth = truth, estimate = estimate),
  prophet.city.preds.truth %>% yardstick::mae(truth = truth, estimate = estimate)
)
library(prophet)
# https://dygraphs.com/options.html
# Resort hotel: same prophet workflow as the city hotel above.
prophet.resort.df = ts.resort.a %>% select(ds = arrival.week, y = total.adult.bookings)
prophet.resort.mdl = prophet.resort.df %>% prophet(yearly.seasonality = TRUE)
## Disabling weekly seasonality. Run prophet with weekly.seasonality=TRUE to override this.
## Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
# Future dataframe: full history plus the next 4 weeks.
prophet.resort.future.df = prophet.resort.mdl %>% make_future_dataframe(
  periods = 4, # next 4 wks
  freq = 'week',
  include_history = TRUE
)
prophet.resort.forecast.df = prophet.resort.mdl %>% predict(prophet.resort.future.df)
prophet.resort.forecast.df %>% head %>% DT::datatable()
# Plot the forecast. Trailing comma after pointSize removed (it silently
# passed an empty argument into dyOptions()).
prophet.resort.mdl %>% dyplot.prophet(
  prophet.resort.forecast.df,
  main = '<h style="color: black; font-size:18px;">Resort Hotel: Total Adults 4 Week Prediction</h>'
) %>%
  dygraphs::dyOptions(
    colors = c('darkorange','blue'),
    pointSize = 2
  )
# Plot forecast components (trend / yearly seasonality).
# The export had fused the next statement directly onto this call, which
# parses as calling the plot's return value -- now separated.
prophet.resort.mdl %>% prophet_plot_components(prophet.resort.forecast.df)
(prophet.resort.preds.truth = tibble(
  arrival.date = ts.resort.test %>% pull(arrival.week) %>% head(4),
  truth = ts.resort.test %>% pull(total.adult.bookings) %>% head(4),
  estimate = prophet.resort.forecast.df %>% pull(yhat) %>% tail(4)
))
# RMSE and MAE on the 4-week holdout.
bind_rows(
  prophet.resort.preds.truth %>% yardstick::rmse(truth = truth, estimate = estimate),
  prophet.resort.preds.truth %>% yardstick::mae(truth = truth, estimate = estimate)
)
# Documentation: https://www.rdocumentation.org/packages/recipes/versions/0.1.14
# A recipe declares the preprocessing pipeline: steps are estimated on the
# training data at prep() time and applied at bake() time.
# NOTE(review): `train` has no visible definition in this export -- presumably
# created in an omitted chunk. Also, `recipe` shadows recipes::recipe().
recipe = train %>% recipe(adr ~ . ) %>%
# remove numeric vars that are highly correlated with others
step_corr(all_numeric(),-all_outcomes()) %>%
# remove vars with low or no variance
step_nzv(all_numeric(),-all_outcomes()) %>%
step_zv(all_numeric(),-all_outcomes()) %>%
#----------------------------
# collapse rare levels of the high-cardinality factors into 'other'
step_other(agent, company, country) %>% #default threshold of 0.05
#----------------------------
step_normalize(stays_in_weekend_nights, stays_in_week_nights) %>% #prob not necessary b/c both vars are already scaled the same, but doing so anyway to develop good habits
step_pca(stays_in_weekend_nights, stays_in_week_nights, num_comp = 1) #will limit to PC1 only
tidy(recipe)## Data Recipe
##
## Inputs:
##
## role #variables
## outcome 1
## predictor 30
##
## Training data contained 11275 data points and no missing data.
##
## Operations:
##
## Correlation filter removed no terms [trained]
## Sparse, unbalanced variable filter removed babies, children, ... [trained]
## Zero variance filter removed no terms [trained]
## Collapsing factor levels for agent, company, country [trained]
## Centering and scaling for stays_in_weekend_nights, stays_in_week_nights [trained]
## PCA extraction with stays_in_weekend_nights, stays_in_week_nights [trained]
# Using the recipe, prepare once and bake both the train and test sets.
# prep() trains the recipe's steps; the original called it twice, re-training
# identically for each bake -- hoisted here.
prepped.recipe = recipe %>% prep()
# bake(new_data = NULL) returns the (already processed) training set
baked.train = prepped.recipe %>% bake(new_data = NULL) %>%
  select(sort(tidyselect::peek_vars())) %>% select(where(is.factor), where(is.numeric))
baked.test = prepped.recipe %>% bake(new_data = test) %>%
  select(sort(tidyselect::peek_vars())) %>% select(where(is.factor), where(is.numeric))
baked.train %>% head() %>% DT::datatable()## $agent
## [1] "240" "9" "NULL" "other"
##
## $company
## [1] "NULL" "other"
##
## $country
## [1] "DEU" "ESP" "FRA" "GBR" "PRT" "other"
## $agent
## # A tibble: 4 x 2
## f n
## <fct> <int>
## 1 other 5368
## 2 9 2771
## 3 NULL 1850
## 4 240 1286
##
## $company
## # A tibble: 2 x 2
## f n
## <fct> <int>
## 1 NULL 10420
## 2 other 855
##
## $country
## # A tibble: 6 x 2
## f n
## <fct> <int>
## 1 other 3580
## 2 PRT 3197
## 3 GBR 1375
## 4 FRA 1258
## 5 DEU 936
## 6 ESP 929
# Glossary of the tidymodels abbreviations used below.
tibble(
  abbreviation = c('rec','mdl','wf','tg'),
  term = c('recipe','model','workflow','tune_grid'),
  description = c('executes necessary preprocessing steps on the features','self explanatory','combines the recipe and model','executes on vfolds using the workflow, and optional hyperparameter grid')
) %>% DT::datatable()
# Hyperparameters set to tune() are optimized by tune_grid below.
rf.mdl = parsnip::rand_forest(
  trees = 300,
  min_n = tune(), # min number of observations at terminal node
  mtry = tune()   # number of vars to randomly subset at each node
) %>%
  set_mode('regression') %>%
  set_engine('ranger', importance = 'impurity_corrected')
# (the export had fused this call onto the set_engine line above)
doParallel::registerDoParallel() # use parallel processing
set.seed(345)
# NOTE(review): rf.wf and vfolds have no visible definitions in this export;
# presumably built in an omitted chunk.
rf.tg = tune_grid(
  rf.wf,
  resamples = vfolds,
  grid = 5) # create a tuning grid AUTOMATICALLY
## i Creating pre-processing data to finalize unknown parameter: mtry
# RMSE vs each tuned hyperparameter (automatic grid).
ggplotly(
  rf.tg %>%
    collect_metrics() %>%
    filter(.metric == "rmse") %>%
    select(mean, min_n, mtry) %>%
    pivot_longer(min_n:mtry,
      values_to = "value",
      names_to = "parameter"
    ) %>%
    ggplot(aes(value, mean, color = parameter)) +
    geom_point(show.legend = FALSE, size = 3) +
    facet_wrap(~parameter, scales = "free_x") +
    labs(x = NULL, y = "RMSE")
)
## Warning: `group_by_()` is deprecated as of dplyr 0.7.0.
## Please use `group_by()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
# Lowest-RMSE configurations from the automatic grid.
rf.tg %>%
  collect_metrics() %>%
  filter(.metric == 'rmse') %>%
  select(mean, mtry, min_n, .config) %>%
  arrange(mean)
# Re-tune with a manual grid. NOTE(review): rf.gr has no visible definition
# in this export. (The export had fused registerDoParallel onto the
# arrange(mean) line above.)
doParallel::registerDoParallel() # use parallel processing
set.seed(345)
rf.tg = tune_grid(
  rf.wf,
  resamples = vfolds,
  grid = rf.gr) # create a tuning grid MANUALLY
# Print the manual-grid tuning results. (The export had fused `rf.tg` and
# `ggplotly(` into one token, `rf.tgggplotly(` -- a call to an undefined
# function.)
rf.tg
# RMSE vs each tuned hyperparameter (manual grid).
ggplotly(
  rf.tg %>%
    collect_metrics() %>%
    filter(.metric == "rmse") %>%
    select(mean, min_n, mtry) %>%
    pivot_longer(min_n:mtry,
      values_to = "value",
      names_to = "parameter"
    ) %>%
    ggplot(aes(value, mean, color = parameter)) +
    geom_point(show.legend = FALSE, size = 3) +
    facet_wrap(~parameter, scales = "free_x") +
    labs(x = NULL, y = "RMSE")
)
# Lowest-RMSE configurations from the manual grid.
rf.tg %>%
  collect_metrics() %>%
  filter(.metric == 'rmse') %>%
  select(mean, mtry, min_n, .config) %>%
  arrange(mean)
# create final workflow
# Final workflow: recipe + finalized model. NOTE(review): rf.mdl.fin and
# split have no visible definitions in this export; presumably built in an
# omitted chunk (rf.mdl finalized with the best tuning result).
rf.wf.fin = workflow() %>%
  add_recipe(recipe) %>%
  add_model(rf.mdl.fin)
# fit on ENTIRE train, execute on test, evaluate results
rf.res.final = rf.wf.fin %>%
  last_fit(split)
# check results
rf.res.final %>% collect_metrics()
# (the export had fused this call onto the collect_metrics() line above)
doParallel::registerDoParallel() # use parallel processing
#!!<NOTE> to analyze var importance, you need an importance arg.
# 'impurity_corrected' importance is unbiased w.r.t. the number of
# categories and category frequencies, and nearly as fast as standard
# impurity importance.
model.rf = ranger::ranger(adr ~ . , data = baked.train, num.trees = 300, importance = 'impurity_corrected')
model.rf
# viz var importance (descending)
model.rf.var.imp = model.rf$variable.importance %>% as.matrix() %>% as.data.frame.matrix() %>% rownames_to_column() %>% rename(var = rowname, imp = V1) %>% arrange(-imp)
model.rf.var.imp %>% ggplot(aes(fct_reorder(var, imp), imp)) + geom_col() + coord_flip()
# make predictions on the baked test set
model.rf.preds = as.vector(predict(model.rf, baked.test, seed = 345)$predictions)
# truth vs prediction
model.rf.df = tibble(y = baked.test$adr, preds = model.rf.preds)
# viz: predicted vs actual with a linear fit
model.rf.df %>% ggplot(aes(y, preds)) + geom_point() + geom_smooth(method='lm')
# what is rmse and mae?
bind_rows(
  rmse = model.rf.df %>% yardstick::rmse(truth = y, estimate = preds),
  mae = model.rf.df %>% yardstick::mae(truth = y, estimate = preds)
)
# compare relative performance:
# what does mae look like rel. to sd of response var?
# (the export had fused the 'Observations' heading onto this line)
paste("test set's standard deviation is: ", round(sd(test$adr), 1))
# Observations